{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import jieba\n",
    "import re\n",
    "from gensim.models import word2vec "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 分词\n",
    "def tokenizer(text): \n",
    "    return [word for word in jieba.lcut(text) if word not in stop_words]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 去停用词\n",
    "def get_stop_words():\n",
    "    file_object = open('data/stopwords.txt',encoding='utf-8')\n",
    "    stop_words = []\n",
    "    for line in file_object.readlines():\n",
    "        line = line[:-1]\n",
    "        line = line.strip()\n",
    "        stop_words.append(line)\n",
    "    return stop_words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>index</th>\n",
       "      <th>label</th>\n",
       "      <th>text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>备胎是硬伤！</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>要说不满意的话，那就是动力了，1.5自然吸气发动机对这款车有种小马拉大车的感觉。如今天气这么...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>油耗显示13升还多一点，希望慢慢下降。没有倒车雷达真可恨</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>空调不太凉，应该是小问题。</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>1、后排座椅不能平放；2、科技感不强，还不如百万帝豪，最希望增加车联网的车机。像你好博越一样...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   index  label                                               text\n",
       "0      0      0                                             备胎是硬伤！\n",
       "1      1      0  要说不满意的话，那就是动力了，1.5自然吸气发动机对这款车有种小马拉大车的感觉。如今天气这么...\n",
       "2      2      0                       油耗显示13升还多一点，希望慢慢下降。没有倒车雷达真可恨\n",
       "3      3      0                                      空调不太凉，应该是小问题。\n",
       "4      4      0  1、后排座椅不能平放；2、科技感不强，还不如百万帝豪，最希望增加车联网的车机。像你好博越一样..."
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "traindata = pd.read_csv('data/train.tsv', sep='\\t')\n",
    "validata = pd.read_csv('data/validation.tsv', sep='\\t')\n",
    "traindata.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "totaldata = pd.concat([traindata, validata])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "totaldata.to_csv('data/text.tsv', columns=['text'], index=0)\n",
    "text = pd.read_csv('data/text.tsv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['!', '\"', '#', '$', '%']"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "stop_words = get_stop_words()\n",
    "stop_words[0:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Building prefix dict from the default dictionary ...\n",
      "Loading model from cache C:\\Users\\ADMINI~1\\AppData\\Local\\Temp\\jieba.cache\n",
      "Loading model cost 2.259 seconds.\n",
      "Prefix dict has been built successfully.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[['备胎', '硬伤'], ['要说', '满意', '动力', '1.5', '自然', '吸气', '发动机', '这款', '车', '有种', '小马拉', '大车', '感觉', '天气', '热', '上路', '肯定', '得开', '空调', '开', '动力', '感觉', '不给力', '空调', '制冷', '效果', '不错'], ['油耗', '显示', '13', '升', '多一点', '希望', '慢慢', '下降', '倒车', '雷达', '真', '可恨'], ['空调', '不太凉'], ['1', '后排', '座椅', '平放', '2', '科技', '感不强', '百万', '帝豪', '希望', '增加', '车', '联网', '车机', '你好', '博越', '3', '全景', '摄像头', '晚上', '用处']]\n"
     ]
    }
   ],
   "source": [
    "text_cut = []\n",
    "for row in text.itertuples():\n",
    "    seg = tokenizer(row[1])\n",
    "    text_cut.append(seg)\n",
    "print(text_cut[0:5])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[['备胎 硬伤'], ['要说 满意 动力 1.5 自然 吸气 发动机 这款 车 有种 小马拉 大车 感觉 天气 热 上路 肯定 得开 空调 开 动力 感觉 不给力 空调 制冷 效果 不错'], ['油耗 显示 13 升 多一点 希望 慢慢 下降 倒车 雷达 真 可恨'], ['空调 不太凉'], ['1 后排 座椅 平放 2 科技 感不强 百万 帝豪 希望 增加 车 联网 车机 你好 博越 3 全景 摄像头 晚上 用处']]\n"
     ]
    }
   ],
   "source": [
    "text_concat=[]\n",
    "for seg in text_cut:\n",
    "    seg_concat=[\" \".join(word for word in seg)]\n",
    "    text_concat.append(seg_concat)\n",
    "print(text_concat[0:5])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "corpus = pd.DataFrame(data=text_concat)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "corpus.to_csv('data/corpus.tsv', header=0, index=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "sentences = word2vec.LineSentence('data/corpus.tsv')\n",
    "model = word2vec.Word2Vec(sentences, min_count=5)# 待考虑"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "'备胎'\t'硬伤'\t0.52\n",
      "'车'\t'用处'\t0.27\n",
      "'百万'\t'空调'\t0.11\n"
     ]
    }
   ],
   "source": [
    "pairs = [\n",
    "    ('备胎', '硬伤'),  \n",
    "    ('车', '用处'),   \n",
    "    ('百万', '空调'), \n",
    "]\n",
    "for w1, w2 in pairs:\n",
    "    print('%r\\t%r\\t%.2f' % (w1, w2,model.wv.similarity(w1, w2)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "vector = model.wv['车']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "word_vectors = model.wv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.save(\"comment.model\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "new_model = word2vec.Word2Vec.load('comment.model')\n",
    "new_model.wv.save_word2vec_format('data/myvector.vector', binary=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:root] *",
   "language": "python",
   "name": "conda-root-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "oldHeight": 172.65322200000003,
   "position": {
    "height": "40px",
    "left": "670.083px",
    "right": "20px",
    "top": "94px",
    "width": "418.181px"
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "varInspector_section_display": "none",
   "window_display": true
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
